spaCy is a free, open-source library for advanced Natural Language Processing (NLP) in Python.
"Tokens" are usually individual words (at least in languages like English), and "tokenization" is taking a text or set of texts and breaking it up into its individual words. These tokens are then used as the input for other types of analysis or tasks, like parsing (automatically tagging the syntactic relationship between words). We need to tokenize words so that we can use them for our title-generating model and other analyses (source)
# Simple example: run the small English pipeline over one sentence and
# print each token's surface text. (The loop body must be indented —
# the original had lost its indentation.)
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"An apple is not a banana")
for token in doc:
    print(token.text)
| 0 | 1 | 2 | 3 | 4 | 5 |
| An | apple | is | not | a | banana |
# Token filters used by spacy_tokenizer below:
# all ASCII punctuation characters as a single string ...
punctuations = string.punctuation
# ... and spaCy's built-in English stop-word set as a list.
stopwords = list(STOP_WORDS)
# Lightweight tokenizer-only English pipeline (no tagger/parser loaded).
parser = English()
def spacy_tokenizer(sentence):
    """Lemmatize and lowercase *sentence*, drop stop words and punctuation.

    Returns the surviving tokens re-joined into one space-separated string.
    """
    tokens = parser(sentence)
    # spaCy (<3.0) lemmatizes pronouns to the placeholder "-PRON-"; for those
    # keep the lowercased surface form instead of the placeholder.
    words = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
             for tok in tokens]
    words = [w for w in words if w not in stopwords and w not in punctuations]
    # join() accepts the list directly — no need for a pass-through comprehension.
    return " ".join(words)
# Clean the titles of each popularity class, with a tqdm progress bar.
tqdm.pandas()
titles = three_countries["title"]
popularity = three_countries["popular"]
normal = titles[popularity == 0].progress_apply(spacy_tokenizer)
popular = titles[popularity == 1].progress_apply(spacy_tokenizer)
#tokenize words by popularity
def word_generator(text):
    """Split *text* on whitespace and return the tokens as a list."""
    # str.split() already returns a list; the original list() wrap was redundant.
    return text.split()
def bigram_generator(text):
    """Return every pair of adjacent words in *text*, space-joined."""
    words = text.split()
    # zip of the word list with itself shifted by one yields adjacent pairs —
    # same output as nltk.bigrams, without the external dependency.
    return [' '.join((a, b)) for a, b in zip(words, words[1:])]
def trigram_generator(text):
    """Return every run of three adjacent words in *text*, space-joined."""
    words = text.split()
    # Three staggered views of the word list give adjacent triples — same
    # output as nltk.trigrams, without the external dependency.
    return [' '.join((a, b, c)) for a, b, c in zip(words, words[1:], words[2:])]
# Build word / bigram / trigram series for each popularity class.
normal_words, popular_words = (s.progress_apply(word_generator) for s in (normal, popular))
normal_bigrams, popular_bigrams = (s.progress_apply(bigram_generator) for s in (normal, popular))
normal_trigrams, popular_trigrams = (s.progress_apply(trigram_generator) for s in (normal, popular))
#function that makes a pretty word frequency plot
def word_plot(words, my_color):
    """Horizontal bar chart of the 20 most frequent tokens in *words*.

    words: iterable of token lists (one list per title).
    my_color: matplotlib color for the bars.
    """
    # Flatten the per-title lists in one comprehension (the original built
    # the list with an explicit extend loop and left an unused `fig` binding).
    flat = [token for title_tokens in words for token in title_tokens]
    plt.figure(figsize=(15, 10))
    (pd.Series(flat)
       .value_counts()[:20]
       .sort_values(ascending=True)
       .plot(kind='barh', fontsize=20, color=my_color))
    plt.show()
# Frequency plots for popular titles: unigrams, bigrams, then trigrams.
for grams, bar_color in ((popular_words, 'blue'),
                         (popular_bigrams, 'orange'),
                         (popular_trigrams, 'red')):
    word_plot(grams, bar_color)
Tf-idf analyzes the impact of tokens (words) across the whole set of documents. For example, the more times a word appears in a document (each title), the more weight it has. However, the more documents (titles) a word appears in, the more it is 'penalized' and the more its weight is diminished, because it is empirically less informative than features that occur in only a small fraction of the training corpus (source)
# Tiny tf-idf demonstration on three toy documents.
txt1 = ['I like banana', 'An apple is not a banana', 'banana banana oh banana']
tf = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word')
txt_fitted = tf.fit(txt1)
txt_transformed = txt_fitted.transform(txt1)
print("The text: ", txt1)
tf.vocabulary_
idf = tf.idf_
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when the installed version provides it.
feature_names = (txt_fitted.get_feature_names_out()
                 if hasattr(txt_fitted, "get_feature_names_out")
                 else txt_fitted.get_feature_names())
print(dict(zip(feature_names, idf)))
print("\nThe token 'banana' appears 5 times but it is also in all documents, so its idf is the lowest")
# Word-level tf-idf features over all titles (up to 10k vocabulary entries).
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000,
)
# fit + transform in a single pass; the fitted vectorizer is reused below.
word_features = word_vectorizer.fit_transform(three_countries.title)
# Fit a logistic-regression popularity classifier on the tf-idf features and
# explain a few individual predictions with LIME.
classifier_popular = LogisticRegression(C=0.1, solver='sag')
classifier_popular.fit(word_features, three_countries.popular)
names = ['normal', 'popular']
c_tf = make_pipeline(word_vectorizer, classifier_popular)
explainer_tf = LimeTextExplainer(class_names=names)
# (row index, number of features to show) for each explained title
for row, n_feat in ((10, 4), (4, 5), (10035, 5)):
    title = three_countries.title.iloc[row]
    exp = explainer_tf.explain_instance(title, c_tf.predict_proba,
                                        num_features=n_feat, top_labels=1)
    exp.show_in_notebook(text=title)
# NOTE(review): `plotly.plotly` moved to the separate `chart_studio` package in
# plotly 4.x — confirm the pinned plotly version before upgrading.
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls
# Compute the category distribution once (the original ran value_counts twice).
category_counts = three_countries.category_names.value_counts()
labels = list(category_counts.index.values)
values = list(category_counts.values)
trace = go.Pie(labels=labels, values=values)
iplot([trace], filename='basic_pie_chart')
# Per-category view statistics.
three_countries.groupby('category_names')['views'].describe()

def _titles_in(category):
    # All titles belonging to one category.
    return three_countries["title"][three_countries['category_names'] == category]

entertainment_title = _titles_in('Entertainment')
news_politics_title = _titles_in('News & Politics')
people_title = _titles_in('People & Blogs')
music_title = _titles_in('Music')
sports_title = _titles_in('Sports')
comedy_title = _titles_in('Comedy')
def _lda_topic_dashboard(titles, n_topics=7):
    """Vectorize *titles*, fit an online LDA model, and build a pyLDAvis view.

    titles: pandas Series of title strings for one category.
    n_topics: number of LDA topics (the original hard-coded 7).
    Returns (vectorizer, doc_term_matrix, lda_model, doc_topic_matrix, dashboard)
    so callers can keep every intermediate object by name.
    """
    # Raw string for the token pattern — '\-' in a plain string is an
    # invalid escape sequence (DeprecationWarning on modern Python).
    vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english',
                                 lowercase=True,
                                 token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    dtm = vectorizer.fit_transform(titles)
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online', verbose=True)
    doc_topics = lda.fit_transform(dtm)
    pyLDAvis.enable_notebook()
    dashboard = pyLDAvis.sklearn.prepare(lda, dtm, vectorizer, mds='tsne')
    return vectorizer, dtm, lda, doc_topics, dashboard

# One pipeline per category; every original module-level name is preserved.
(vectorizer_entertainment_title, entertainment_title_vectorized,
 lda_popular_entertainment_title_vectorized, entertainment_title_vectorized_lda,
 dash) = _lda_topic_dashboard(entertainment_title)
dash
(vectorizer_news_politics_title, news_politics_title_vectorized,
 lda_news_politics_title_vectorized, news_politics_title_vectorized_lda,
 dash) = _lda_topic_dashboard(news_politics_title)
dash
(vectorizer_people_title, people_title_vectorized,
 lda_people_title_vectorized, people_title_vectorized_lda,
 dash) = _lda_topic_dashboard(people_title)
dash
(vectorizer_music_title, music_title_vectorized,
 lda_music_title_vectorized, music_title_vectorized_lda,
 dash) = _lda_topic_dashboard(music_title)
dash
(vectorizer_sports_title, sports_title_vectorized,
 lda_sports_title_vectorized, sports_title_vectorized_lda,
 dash) = _lda_topic_dashboard(sports_title)
dash
(vectorizer_comedy_title, comedy_title_vectorized,
 lda_comedy_title_vectorized, comedy_title_vectorized_lda,
 dash) = _lda_topic_dashboard(comedy_title)
dash